In [1]:
#!pip install -U kaleido

Import des librairies

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', 100)

import missingno as msno
import plotly.graph_objects as go
import plotly.express as px

from plotly.subplots import make_subplots

Import des datasets

In [3]:
df_customers = pd.read_csv('Dataset/olist_customers_dataset.csv')
df_orders = pd.read_csv('Dataset/olist_orders_dataset.csv')
df_order_items = pd.read_csv('Dataset/olist_order_items_dataset.csv')
df_order_payments = pd.read_csv('Dataset/olist_order_payments_dataset.csv')
df_order_reviews = pd.read_csv('Dataset/olist_order_reviews_dataset.csv')
df_products = pd.read_csv('Dataset/olist_products_dataset.csv')
df_trad = pd.read_csv('Dataset/product_category_name_translation.csv')
df_sellers = pd.read_csv('Dataset/olist_sellers_dataset.csv')
df_geoloc = pd.read_csv('Dataset/olist_geolocation_dataset.csv')

Description des jeu de données

In [4]:
datasets_dict = {"customers": df_customers, "orders": df_orders, 
                 "order_items": df_order_items, "order_payments": df_order_payments, 
                 "order_reviews": df_order_reviews,"products": df_products,
                 "sellers":df_sellers,"trad": df_trad, "geoloc": df_geoloc }
In [5]:
for item in datasets_dict:
    print("**")
    print("Le jeu de données %s contient %d lignes et %d colonnes." % (item, datasets_dict[item].shape[0], datasets_dict[item].shape[1]))
**
Le jeu de données customers contient 99441 lignes et 5 colonnes.
**
Le jeu de données orders contient 99441 lignes et 8 colonnes.
**
Le jeu de données order_items contient 112650 lignes et 7 colonnes.
**
Le jeu de données order_payments contient 103886 lignes et 5 colonnes.
**
Le jeu de données order_reviews contient 99224 lignes et 7 colonnes.
**
Le jeu de données products contient 32951 lignes et 9 colonnes.
**
Le jeu de données sellers contient 3095 lignes et 4 colonnes.
**
Le jeu de données trad contient 71 lignes et 2 colonnes.
**
Le jeu de données geoloc contient 1000163 lignes et 5 colonnes.
In [6]:
for item in datasets_dict:
    print("###########################################################")
    print(item)
    print(datasets_dict[item].columns)
###########################################################
customers
Index(['customer_id', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state'],
      dtype='object')
###########################################################
orders
Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date'],
      dtype='object')
###########################################################
order_items
Index(['order_id', 'order_item_id', 'product_id', 'seller_id',
       'shipping_limit_date', 'price', 'freight_value'],
      dtype='object')
###########################################################
order_payments
Index(['order_id', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value'],
      dtype='object')
###########################################################
order_reviews
Index(['review_id', 'order_id', 'review_score', 'review_comment_title',
       'review_comment_message', 'review_creation_date',
       'review_answer_timestamp'],
      dtype='object')
###########################################################
products
Index(['product_id', 'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm'],
      dtype='object')
###########################################################
sellers
Index(['seller_id', 'seller_zip_code_prefix', 'seller_city', 'seller_state'], dtype='object')
###########################################################
trad
Index(['product_category_name', 'product_category_name_english'], dtype='object')
###########################################################
geoloc
Index(['geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng',
       'geolocation_city', 'geolocation_state'],
      dtype='object')
  • olist_customers_dataset.csv contient les informations clients.
  • olist_geolocation_dataset.csv contient les informations sur la géolocalisation
  • olist_order_items_dataset.csv contient les informations sur les items
  • olist_order_payments_dataset.csv contient les informations sur les paiements
  • olist_order_reviews_dataset.csv contient les informations l'évaluation des produits
  • olist_orders_dataset.csv contient les informations sur les achats
  • olist_products_dataset.csv contient les informations sur les produits
  • olist_sellers_dataset.csv contient les information sur les vendeurs
  • product_category_name_translation.csv contient les informations sur les catégories des produits
Dans les tables, on peut voir que
- "df_orders" est reliée à la table "df_customers" par la clé "customer_id".  
- "order_items", "order_payments" et "order_reviews" sont relié à "orders" par la clé "order_id"
- "products" et "trad" sont relié par "product_category_name"
- "order_items" et "products" sont relié par "product_id"
- "order_items" et ""sellers" sont relié par "seller_id"

On va donc les fusionner.

Visualisation de valeurs manquantes

In [7]:
for df in datasets_dict:
  msno.matrix(datasets_dict[df])
  plt.title("Répartition de valeur manquantes {}".format(df))
  plt.xlabel("Nom des colonnes")
  plt.ylabel("Nombre de lignes")

Customers

In [8]:
df_customers.shape
Out[8]:
(99441, 5)
In [9]:
df_customers.head()
Out[9]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 franca SP
1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 9790 sao bernardo do campo SP
2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e 1151 sao paulo SP
3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c 8775 mogi das cruzes SP
4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 13056 campinas SP
In [10]:
df_customers.shape
Out[10]:
(99441, 5)
In [ ]:
 

geoloc

In [11]:
df_geoloc.shape
Out[11]:
(1000163, 5)
In [12]:
df_geoloc.head()
Out[12]:
geolocation_zip_code_prefix geolocation_lat geolocation_lng geolocation_city geolocation_state
0 1037 -23.545621 -46.639292 sao paulo SP
1 1046 -23.546081 -46.644820 sao paulo SP
2 1046 -23.546129 -46.642951 sao paulo SP
3 1041 -23.544392 -46.639499 sao paulo SP
4 1035 -23.541578 -46.641607 sao paulo SP
In [13]:
# Doublons ?
df_geoloc['geolocation_zip_code_prefix'].nunique()
Out[13]:
19015
In [14]:
df_geoloc[df_geoloc['geolocation_zip_code_prefix']== 1041].head()
Out[14]:
geolocation_zip_code_prefix geolocation_lat geolocation_lng geolocation_city geolocation_state
3 1041 -23.544392 -46.639499 sao paulo SP
29 1041 -23.543539 -46.640577 sao paulo SP
238 1041 -23.543713 -46.640263 sao paulo SP
312 1041 -23.543713 -46.640263 sao paulo SP
314 1041 -23.543539 -46.640577 sao paulo SP

On remarque qu'à un même code postal sont associées différents localisations ('geolocation_lat', 'geolocation_lng'). Nous allons prendre la moyenne des différentes positions géographiques correspondant à un même code postal.

In [15]:
# Regrouper les latitudes/longitudes par leurs moyennes
df_geoloc = df_geoloc.groupby('geolocation_zip_code_prefix') \
    .agg({'geolocation_lat': 'mean', 'geolocation_lng': 'mean',
          'geolocation_city': 'first', 'geolocation_state': 'first'}) \
    .reset_index()
# Vérification
df_geoloc['geolocation_zip_code_prefix'].nunique()
Out[15]:
19015
In [16]:
df_geoloc.head()
Out[16]:
geolocation_zip_code_prefix geolocation_lat geolocation_lng geolocation_city geolocation_state
0 1001 -23.550190 -46.634024 sao paulo SP
1 1002 -23.548146 -46.634979 sao paulo SP
2 1003 -23.548994 -46.635731 sao paulo SP
3 1004 -23.549799 -46.634757 sao paulo SP
4 1005 -23.549456 -46.636733 sao paulo SP
In [17]:
df_geoloc.geolocation_zip_code_prefix.unique()
Out[17]:
array([ 1001,  1002,  1003, ..., 99970, 99980, 99990])

Fusion les tables

In [18]:
def join_dfs(df1:pd.DataFrame, df2:pd.DataFrame, key:str, how:str='left') -> pd.DataFrame:
    print(f"{'The shape of df1:':<22}", df1.shape)
    print(f"{'The shape of df2:':<22}", df2.shape)
    df = df1.merge(df2, how=how, on=key)
    print("The shape of final df:", df.shape)
    return df

Jointure avec la table 'df_customers'

In [19]:
# change key name of geolocation
df_geoloc.rename(
    columns={"geolocation_zip_code_prefix": "customer_zip_code_prefix"}, inplace=True
)

# join consumers info dataframes
df_customers = join_dfs(
    df_customers,
    df_geoloc,
    key="customer_zip_code_prefix",
    how="left",
)
df_customers.head()
The shape of df1:      (99441, 5)
The shape of df2:      (19015, 5)
The shape of final df: (99441, 9)
Out[19]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng geolocation_city geolocation_state
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 franca SP -20.498489 -47.396929 franca SP
1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 9790 sao bernardo do campo SP -23.727992 -46.542848 sao bernardo do campo SP
2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e 1151 sao paulo SP -23.531642 -46.656289 sao paulo SP
3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c 8775 mogi das cruzes SP -23.499702 -46.185233 mogi das cruzes SP
4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 13056 campinas SP -22.975100 -47.142925 campinas SP
In [20]:
df_customers[["customer_city", "geolocation_city"]][
    ~(df_customers.customer_city == df_customers.geolocation_city)
].sample(10)
Out[20]:
customer_city geolocation_city
26303 itapirapua itapirapuã
70915 itabera itaberá
54852 varzea paulista várzea paulista
71955 goiania goiânia
52228 brasilia NaN
42051 itaguai itaguaí
60739 niteroi niterói
55422 ribeirao pires ribeirão pires
27825 sao paulo são paulo
53890 sao paulo são paulo
In [21]:
df_customers[["customer_state", "geolocation_state"]][
    ~(df_customers.customer_state == df_customers.geolocation_state)
].sample(5)
Out[21]:
customer_state geolocation_state
59327 DF NaN
11238 DF NaN
36645 DF NaN
14629 DF NaN
66517 DF NaN

Y a une différence d'accent pour les 'city' et les 'state' contiennent quelques valeurs manquantes. On va garder celles de 'customer'

In [22]:
fig = px.scatter_mapbox(df_customers,
                        lat="geolocation_lat",
                        lon="geolocation_lng",
                        mapbox_style = "open-street-map",
                        hover_data = ["geolocation_state"],
                        zoom=3,
                        title="Répartition des clients selon leurs états")
fig.show()
fig.write_image("plots/clients_etat.png")

Le ecommerce brésilien a atteint le Portugal

In [23]:
# drop useless columns
df_customers.drop(columns=['geolocation_city', 'geolocation_state'], inplace=True)
df_customers.shape
Out[23]:
(99441, 7)

Order

In [24]:
df_orders.head()
Out[24]:
order_id customer_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date
0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18 00:00:00
1 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 2018-07-26 14:31:00 2018-08-07 15:27:45 2018-08-13 00:00:00
2 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 2018-08-08 13:50:00 2018-08-17 18:06:29 2018-09-04 00:00:00
3 949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82 delivered 2017-11-18 19:28:06 2017-11-18 19:45:59 2017-11-22 13:39:59 2017-12-02 00:28:42 2017-12-15 00:00:00
4 ad21c59c0840e6cb83a9ceb5573f8159 8ab97904e6daea8866dbdbc4fb7aad2c delivered 2018-02-13 21:18:39 2018-02-13 22:20:29 2018-02-14 19:46:34 2018-02-16 18:17:02 2018-02-26 00:00:00
In [25]:
df_orders.isna().sum()
Out[25]:
order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64
In [26]:
df_orders.order_status.unique()
Out[26]:
array(['delivered', 'invoiced', 'shipped', 'processing', 'unavailable',
       'canceled', 'created', 'approved'], dtype=object)
In [27]:
df_orders.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   order_id                       99441 non-null  object
 1   customer_id                    99441 non-null  object
 2   order_status                   99441 non-null  object
 3   order_purchase_timestamp       99441 non-null  object
 4   order_approved_at              99281 non-null  object
 5   order_delivered_carrier_date   97658 non-null  object
 6   order_delivered_customer_date  96476 non-null  object
 7   order_estimated_delivery_date  99441 non-null  object
dtypes: object(8)
memory usage: 6.1+ MB

Convert to datetime format

In [28]:
df_orders=pd.read_csv('Dataset/olist_orders_dataset.csv',parse_dates=['order_purchase_timestamp','order_approved_at','order_delivered_carrier_date','order_delivered_customer_date','order_estimated_delivery_date'])
df_orders.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_carrier_date   97658 non-null  datetime64[ns]
 6   order_delivered_customer_date  96476 non-null  datetime64[ns]
 7   order_estimated_delivery_date  99441 non-null  datetime64[ns]
dtypes: datetime64[ns](5), object(3)
memory usage: 6.1+ MB
In [29]:
df_orders = join_dfs(df_orders, df_order_payments, key="order_id", how="left")
df_orders = join_dfs(df_orders, df_order_reviews, key="order_id", how="left")
df_orders = join_dfs(df_orders, df_order_items, key="order_id", how="left")
df_orders.head()
The shape of df1:      (99441, 8)
The shape of df2:      (103886, 5)
The shape of final df: (103887, 12)
The shape of df1:      (103887, 12)
The shape of df2:      (99224, 7)
The shape of final df: (104478, 18)
The shape of df1:      (104478, 18)
The shape of df2:      (112650, 7)
The shape of final df: (119143, 24)
Out[29]:
order_id customer_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp order_item_id product_id seller_id shipping_limit_date price freight_value
0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18 1.0 credit_card 1.0 18.12 a54f0611adc9ed256b57ede6b6eb5114 4.0 NaN Não testei o produto ainda, mas ele veio corre... 2017-10-11 00:00:00 2017-10-12 03:43:48 1.0 87285b34884572647811a353c7ac498a 3504c0cb71d7fa48d967e0e4c94d59d9 2017-10-06 11:07:15 29.99 8.72
1 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18 3.0 voucher 1.0 2.00 a54f0611adc9ed256b57ede6b6eb5114 4.0 NaN Não testei o produto ainda, mas ele veio corre... 2017-10-11 00:00:00 2017-10-12 03:43:48 1.0 87285b34884572647811a353c7ac498a 3504c0cb71d7fa48d967e0e4c94d59d9 2017-10-06 11:07:15 29.99 8.72
2 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18 2.0 voucher 1.0 18.59 a54f0611adc9ed256b57ede6b6eb5114 4.0 NaN Não testei o produto ainda, mas ele veio corre... 2017-10-11 00:00:00 2017-10-12 03:43:48 1.0 87285b34884572647811a353c7ac498a 3504c0cb71d7fa48d967e0e4c94d59d9 2017-10-06 11:07:15 29.99 8.72
3 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 2018-07-26 14:31:00 2018-08-07 15:27:45 2018-08-13 1.0 boleto 1.0 141.46 8d5266042046a06655c8db133d120ba5 4.0 Muito boa a loja Muito bom o produto. 2018-08-08 00:00:00 2018-08-08 18:37:50 1.0 595fac2a385ac33a80bd5114aec74eb8 289cdb325fb7e7f891c38608bf9e0962 2018-07-30 03:24:27 118.70 22.76
4 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 2018-08-08 13:50:00 2018-08-17 18:06:29 2018-09-04 1.0 credit_card 3.0 179.12 e73b67b67587f7644d5bd1a52deb1b01 5.0 NaN NaN 2018-08-18 00:00:00 2018-08-22 19:07:58 1.0 aa4383b373c6aca5d8797843e5594415 4869f7a5dfa277a7dca6462dcf3b52b2 2018-08-13 08:55:23 159.90 19.22
In [30]:
df = pd.DataFrame()
sum_missing_values = df_orders.isnull().sum()
percentage = 100 * sum_missing_values / len(df_orders)
df['% Missing Values'] = percentage.round(2)
df.style.background_gradient(axis=0)
Out[30]:
% Missing Values
order_id 0.000000
customer_id 0.000000
order_status 0.000000
order_purchase_timestamp 0.000000
order_approved_at 0.150000
order_delivered_carrier_date 1.750000
order_delivered_customer_date 2.870000
order_estimated_delivery_date 0.000000
payment_sequential 0.000000
payment_type 0.000000
payment_installments 0.000000
payment_value 0.000000
review_id 0.840000
review_score 0.840000
review_comment_title 88.260000
review_comment_message 57.830000
review_creation_date 0.840000
review_answer_timestamp 0.840000
order_item_id 0.700000
product_id 0.700000
seller_id 0.700000
shipping_limit_date 0.700000
price 0.700000
freight_value 0.700000
In [31]:
df_orders.drop_duplicates(subset="order_id", keep="last", inplace=True)
In [32]:
# join customers info dataframes df_orders
df_customers = join_dfs(df_customers, df_orders, key="customer_id", how="left")
df_customers.head()
The shape of df1:      (99441, 7)
The shape of df2:      (99441, 24)
The shape of final df: (99441, 30)
Out[32]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp order_item_id product_id seller_id shipping_limit_date price freight_value
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 franca SP -20.498489 -47.396929 00e7ee1b050b8499577073aeb2a297a1 delivered 2017-05-16 15:05:35 2017-05-16 15:22:12 2017-05-23 10:47:57 2017-05-25 10:35:35 2017-06-05 1.0 credit_card 2.0 146.87 88b8b52d46df026a9d1ad2136a59b30b 4.0 NaN NaN 2017-05-26 00:00:00 2017-05-30 22:34:40 1.0 a9516a079e37a9c9c36b9b78b10169e8 7c67e1448b00f6e969d365cea6b010ab 2017-05-22 15:22:12 124.99 21.88
1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 9790 sao bernardo do campo SP -23.727992 -46.542848 29150127e6685892b6eab3eec79f59c7 delivered 2018-01-12 20:48:24 2018-01-12 20:58:32 2018-01-15 17:14:59 2018-01-29 12:41:19 2018-02-06 1.0 credit_card 8.0 335.48 02fc48a9efa3e3d0f1a8ea26507eeec3 5.0 NaN NaN 2018-01-30 00:00:00 2018-02-10 22:43:29 1.0 4aa6014eceb682077f9dc4bffebc05b0 b8bc237ba3788b23da09c0f1f3a3288c 2018-01-18 20:58:32 289.00 46.48
2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e 1151 sao paulo SP -23.531642 -46.656289 b2059ed67ce144a36e2aa97d2c9e9ad2 delivered 2018-05-19 16:07:45 2018-05-20 16:19:10 2018-06-11 14:31:00 2018-06-14 17:58:51 2018-06-13 1.0 credit_card 7.0 157.73 5ad6695d76ee186dc473c42706984d87 5.0 NaN NaN 2018-06-15 00:00:00 2018-06-15 12:10:59 1.0 bd07b66896d6f1494f5b86251848ced7 7c67e1448b00f6e969d365cea6b010ab 2018-06-05 16:19:10 139.94 17.79
3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c 8775 mogi das cruzes SP -23.499702 -46.185233 951670f92359f4fe4a63112aa7306eba delivered 2018-03-13 16:06:38 2018-03-13 17:29:19 2018-03-27 23:22:42 2018-03-28 16:04:25 2018-04-10 1.0 credit_card 1.0 173.30 059a801bb31f6aab2266e672cab87bc5 5.0 NaN NaN 2018-03-29 00:00:00 2018-04-02 18:36:47 1.0 a5647c44af977b148e0a3a4751a09e2e 7c67e1448b00f6e969d365cea6b010ab 2018-03-27 16:31:16 149.94 23.36
4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 13056 campinas SP -22.975100 -47.142925 6b7d50bd145f6fc7f33cebabd7e49d0f delivered 2018-07-29 09:51:30 2018-07-29 10:10:09 2018-07-30 15:16:00 2018-08-09 20:55:48 2018-08-15 1.0 credit_card 8.0 252.25 8490879d58d6c5d7773f2739a03f089a 5.0 a melhor nota O baratheon è esxelente Amo adoro o baratheon 2018-08-10 00:00:00 2018-08-17 01:59:52 1.0 9391a573abe00141c56e38d84d7d5b3b 4a3ca9315b744ce9f8e9374361493884 2018-07-31 10:10:09 230.00 22.25
In [33]:
# Fusion du nouveau dataframe avec "products" via la clé "product_id"
df_customers = join_dfs(df_customers, df_products, key="product_id", how="left")
df_customers.head()
The shape of df1:      (99441, 30)
The shape of df2:      (32951, 9)
The shape of final df: (99441, 38)
Out[33]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp order_item_id product_id seller_id shipping_limit_date price freight_value product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 franca SP -20.498489 -47.396929 00e7ee1b050b8499577073aeb2a297a1 delivered 2017-05-16 15:05:35 2017-05-16 15:22:12 2017-05-23 10:47:57 2017-05-25 10:35:35 2017-06-05 1.0 credit_card 2.0 146.87 88b8b52d46df026a9d1ad2136a59b30b 4.0 NaN NaN 2017-05-26 00:00:00 2017-05-30 22:34:40 1.0 a9516a079e37a9c9c36b9b78b10169e8 7c67e1448b00f6e969d365cea6b010ab 2017-05-22 15:22:12 124.99 21.88 moveis_escritorio 41.0 1141.0 1.0 8683.0 54.0 64.0 31.0
1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 9790 sao bernardo do campo SP -23.727992 -46.542848 29150127e6685892b6eab3eec79f59c7 delivered 2018-01-12 20:48:24 2018-01-12 20:58:32 2018-01-15 17:14:59 2018-01-29 12:41:19 2018-02-06 1.0 credit_card 8.0 335.48 02fc48a9efa3e3d0f1a8ea26507eeec3 5.0 NaN NaN 2018-01-30 00:00:00 2018-02-10 22:43:29 1.0 4aa6014eceb682077f9dc4bffebc05b0 b8bc237ba3788b23da09c0f1f3a3288c 2018-01-18 20:58:32 289.00 46.48 utilidades_domesticas 43.0 1002.0 3.0 10150.0 89.0 15.0 40.0
2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e 1151 sao paulo SP -23.531642 -46.656289 b2059ed67ce144a36e2aa97d2c9e9ad2 delivered 2018-05-19 16:07:45 2018-05-20 16:19:10 2018-06-11 14:31:00 2018-06-14 17:58:51 2018-06-13 1.0 credit_card 7.0 157.73 5ad6695d76ee186dc473c42706984d87 5.0 NaN NaN 2018-06-15 00:00:00 2018-06-15 12:10:59 1.0 bd07b66896d6f1494f5b86251848ced7 7c67e1448b00f6e969d365cea6b010ab 2018-06-05 16:19:10 139.94 17.79 moveis_escritorio 55.0 955.0 1.0 8267.0 52.0 52.0 17.0
3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c 8775 mogi das cruzes SP -23.499702 -46.185233 951670f92359f4fe4a63112aa7306eba delivered 2018-03-13 16:06:38 2018-03-13 17:29:19 2018-03-27 23:22:42 2018-03-28 16:04:25 2018-04-10 1.0 credit_card 1.0 173.30 059a801bb31f6aab2266e672cab87bc5 5.0 NaN NaN 2018-03-29 00:00:00 2018-04-02 18:36:47 1.0 a5647c44af977b148e0a3a4751a09e2e 7c67e1448b00f6e969d365cea6b010ab 2018-03-27 16:31:16 149.94 23.36 moveis_escritorio 48.0 1066.0 1.0 12160.0 56.0 51.0 28.0
4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 13056 campinas SP -22.975100 -47.142925 6b7d50bd145f6fc7f33cebabd7e49d0f delivered 2018-07-29 09:51:30 2018-07-29 10:10:09 2018-07-30 15:16:00 2018-08-09 20:55:48 2018-08-15 1.0 credit_card 8.0 252.25 8490879d58d6c5d7773f2739a03f089a 5.0 a melhor nota O baratheon è esxelente Amo adoro o baratheon 2018-08-10 00:00:00 2018-08-17 01:59:52 1.0 9391a573abe00141c56e38d84d7d5b3b 4a3ca9315b744ce9f8e9374361493884 2018-07-31 10:10:09 230.00 22.25 casa_conforto 61.0 407.0 1.0 5200.0 45.0 15.0 35.0
In [34]:
# Fusion du nouveau dataframe avec "trad" 
df_customers = join_dfs(df_customers, df_trad, key="product_category_name", how="left")
df_customers.head()
The shape of df1:      (99441, 38)
The shape of df2:      (71, 2)
The shape of final df: (99441, 39)
Out[34]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp order_item_id product_id seller_id shipping_limit_date price freight_value product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm product_category_name_english
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 franca SP -20.498489 -47.396929 00e7ee1b050b8499577073aeb2a297a1 delivered 2017-05-16 15:05:35 2017-05-16 15:22:12 2017-05-23 10:47:57 2017-05-25 10:35:35 2017-06-05 1.0 credit_card 2.0 146.87 88b8b52d46df026a9d1ad2136a59b30b 4.0 NaN NaN 2017-05-26 00:00:00 2017-05-30 22:34:40 1.0 a9516a079e37a9c9c36b9b78b10169e8 7c67e1448b00f6e969d365cea6b010ab 2017-05-22 15:22:12 124.99 21.88 moveis_escritorio 41.0 1141.0 1.0 8683.0 54.0 64.0 31.0 office_furniture
1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 9790 sao bernardo do campo SP -23.727992 -46.542848 29150127e6685892b6eab3eec79f59c7 delivered 2018-01-12 20:48:24 2018-01-12 20:58:32 2018-01-15 17:14:59 2018-01-29 12:41:19 2018-02-06 1.0 credit_card 8.0 335.48 02fc48a9efa3e3d0f1a8ea26507eeec3 5.0 NaN NaN 2018-01-30 00:00:00 2018-02-10 22:43:29 1.0 4aa6014eceb682077f9dc4bffebc05b0 b8bc237ba3788b23da09c0f1f3a3288c 2018-01-18 20:58:32 289.00 46.48 utilidades_domesticas 43.0 1002.0 3.0 10150.0 89.0 15.0 40.0 housewares
2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e 1151 sao paulo SP -23.531642 -46.656289 b2059ed67ce144a36e2aa97d2c9e9ad2 delivered 2018-05-19 16:07:45 2018-05-20 16:19:10 2018-06-11 14:31:00 2018-06-14 17:58:51 2018-06-13 1.0 credit_card 7.0 157.73 5ad6695d76ee186dc473c42706984d87 5.0 NaN NaN 2018-06-15 00:00:00 2018-06-15 12:10:59 1.0 bd07b66896d6f1494f5b86251848ced7 7c67e1448b00f6e969d365cea6b010ab 2018-06-05 16:19:10 139.94 17.79 moveis_escritorio 55.0 955.0 1.0 8267.0 52.0 52.0 17.0 office_furniture
3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c 8775 mogi das cruzes SP -23.499702 -46.185233 951670f92359f4fe4a63112aa7306eba delivered 2018-03-13 16:06:38 2018-03-13 17:29:19 2018-03-27 23:22:42 2018-03-28 16:04:25 2018-04-10 1.0 credit_card 1.0 173.30 059a801bb31f6aab2266e672cab87bc5 5.0 NaN NaN 2018-03-29 00:00:00 2018-04-02 18:36:47 1.0 a5647c44af977b148e0a3a4751a09e2e 7c67e1448b00f6e969d365cea6b010ab 2018-03-27 16:31:16 149.94 23.36 moveis_escritorio 48.0 1066.0 1.0 12160.0 56.0 51.0 28.0 office_furniture
4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 13056 campinas SP -22.975100 -47.142925 6b7d50bd145f6fc7f33cebabd7e49d0f delivered 2018-07-29 09:51:30 2018-07-29 10:10:09 2018-07-30 15:16:00 2018-08-09 20:55:48 2018-08-15 1.0 credit_card 8.0 252.25 8490879d58d6c5d7773f2739a03f089a 5.0 a melhor nota O baratheon è esxelente Amo adoro o baratheon 2018-08-10 00:00:00 2018-08-17 01:59:52 1.0 9391a573abe00141c56e38d84d7d5b3b 4a3ca9315b744ce9f8e9374361493884 2018-07-31 10:10:09 230.00 22.25 casa_conforto 61.0 407.0 1.0 5200.0 45.0 15.0 35.0 home_confort
In [35]:
df_customers = join_dfs(df_customers, df_sellers, key="seller_id", how="left")
df_customers.head()
The shape of df1:      (99441, 39)
The shape of df2:      (3095, 4)
The shape of final df: (99441, 42)
Out[35]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp order_item_id product_id seller_id shipping_limit_date price freight_value product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm product_category_name_english seller_zip_code_prefix seller_city seller_state
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 franca SP -20.498489 -47.396929 00e7ee1b050b8499577073aeb2a297a1 delivered 2017-05-16 15:05:35 2017-05-16 15:22:12 2017-05-23 10:47:57 2017-05-25 10:35:35 2017-06-05 1.0 credit_card 2.0 146.87 88b8b52d46df026a9d1ad2136a59b30b 4.0 NaN NaN 2017-05-26 00:00:00 2017-05-30 22:34:40 1.0 a9516a079e37a9c9c36b9b78b10169e8 7c67e1448b00f6e969d365cea6b010ab 2017-05-22 15:22:12 124.99 21.88 moveis_escritorio 41.0 1141.0 1.0 8683.0 54.0 64.0 31.0 office_furniture 8577.0 itaquaquecetuba SP
1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 9790 sao bernardo do campo SP -23.727992 -46.542848 29150127e6685892b6eab3eec79f59c7 delivered 2018-01-12 20:48:24 2018-01-12 20:58:32 2018-01-15 17:14:59 2018-01-29 12:41:19 2018-02-06 1.0 credit_card 8.0 335.48 02fc48a9efa3e3d0f1a8ea26507eeec3 5.0 NaN NaN 2018-01-30 00:00:00 2018-02-10 22:43:29 1.0 4aa6014eceb682077f9dc4bffebc05b0 b8bc237ba3788b23da09c0f1f3a3288c 2018-01-18 20:58:32 289.00 46.48 utilidades_domesticas 43.0 1002.0 3.0 10150.0 89.0 15.0 40.0 housewares 88303.0 itajai SC
2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e 1151 sao paulo SP -23.531642 -46.656289 b2059ed67ce144a36e2aa97d2c9e9ad2 delivered 2018-05-19 16:07:45 2018-05-20 16:19:10 2018-06-11 14:31:00 2018-06-14 17:58:51 2018-06-13 1.0 credit_card 7.0 157.73 5ad6695d76ee186dc473c42706984d87 5.0 NaN NaN 2018-06-15 00:00:00 2018-06-15 12:10:59 1.0 bd07b66896d6f1494f5b86251848ced7 7c67e1448b00f6e969d365cea6b010ab 2018-06-05 16:19:10 139.94 17.79 moveis_escritorio 55.0 955.0 1.0 8267.0 52.0 52.0 17.0 office_furniture 8577.0 itaquaquecetuba SP
3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c 8775 mogi das cruzes SP -23.499702 -46.185233 951670f92359f4fe4a63112aa7306eba delivered 2018-03-13 16:06:38 2018-03-13 17:29:19 2018-03-27 23:22:42 2018-03-28 16:04:25 2018-04-10 1.0 credit_card 1.0 173.30 059a801bb31f6aab2266e672cab87bc5 5.0 NaN NaN 2018-03-29 00:00:00 2018-04-02 18:36:47 1.0 a5647c44af977b148e0a3a4751a09e2e 7c67e1448b00f6e969d365cea6b010ab 2018-03-27 16:31:16 149.94 23.36 moveis_escritorio 48.0 1066.0 1.0 12160.0 56.0 51.0 28.0 office_furniture 8577.0 itaquaquecetuba SP
4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 13056 campinas SP -22.975100 -47.142925 6b7d50bd145f6fc7f33cebabd7e49d0f delivered 2018-07-29 09:51:30 2018-07-29 10:10:09 2018-07-30 15:16:00 2018-08-09 20:55:48 2018-08-15 1.0 credit_card 8.0 252.25 8490879d58d6c5d7773f2739a03f089a 5.0 a melhor nota O baratheon è esxelente Amo adoro o baratheon 2018-08-10 00:00:00 2018-08-17 01:59:52 1.0 9391a573abe00141c56e38d84d7d5b3b 4a3ca9315b744ce9f8e9374361493884 2018-07-31 10:10:09 230.00 22.25 casa_conforto 61.0 407.0 1.0 5200.0 45.0 15.0 35.0 home_confort 14940.0 ibitinga SP
In [36]:
df_customers.drop(columns='seller_zip_code_prefix', inplace=True)
In [37]:
df_customers.shape
Out[37]:
(99441, 41)
In [38]:
df_customers.to_csv(('Dataset/olist_merged.csv'))

Importation nouveau df fusionner

In [39]:
olist_df = pd.read_csv('Dataset/olist_merged.csv', index_col=0,parse_dates=['order_purchase_timestamp','order_approved_at','order_delivered_carrier_date','order_delivered_customer_date','order_estimated_delivery_date','review_creation_date','review_answer_timestamp','shipping_limit_date'])
In [40]:
olist_df.shape
Out[40]:
(99441, 41)
In [41]:
olist_df.head()
Out[41]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp order_item_id product_id seller_id shipping_limit_date price freight_value product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm product_category_name_english seller_city seller_state
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 franca SP -20.498489 -47.396929 00e7ee1b050b8499577073aeb2a297a1 delivered 2017-05-16 15:05:35 2017-05-16 15:22:12 2017-05-23 10:47:57 2017-05-25 10:35:35 2017-06-05 1.0 credit_card 2.0 146.87 88b8b52d46df026a9d1ad2136a59b30b 4.0 NaN NaN 2017-05-26 2017-05-30 22:34:40 1.0 a9516a079e37a9c9c36b9b78b10169e8 7c67e1448b00f6e969d365cea6b010ab 2017-05-22 15:22:12 124.99 21.88 moveis_escritorio 41.0 1141.0 1.0 8683.0 54.0 64.0 31.0 office_furniture itaquaquecetuba SP
1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 9790 sao bernardo do campo SP -23.727992 -46.542848 29150127e6685892b6eab3eec79f59c7 delivered 2018-01-12 20:48:24 2018-01-12 20:58:32 2018-01-15 17:14:59 2018-01-29 12:41:19 2018-02-06 1.0 credit_card 8.0 335.48 02fc48a9efa3e3d0f1a8ea26507eeec3 5.0 NaN NaN 2018-01-30 2018-02-10 22:43:29 1.0 4aa6014eceb682077f9dc4bffebc05b0 b8bc237ba3788b23da09c0f1f3a3288c 2018-01-18 20:58:32 289.00 46.48 utilidades_domesticas 43.0 1002.0 3.0 10150.0 89.0 15.0 40.0 housewares itajai SC
2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e 1151 sao paulo SP -23.531642 -46.656289 b2059ed67ce144a36e2aa97d2c9e9ad2 delivered 2018-05-19 16:07:45 2018-05-20 16:19:10 2018-06-11 14:31:00 2018-06-14 17:58:51 2018-06-13 1.0 credit_card 7.0 157.73 5ad6695d76ee186dc473c42706984d87 5.0 NaN NaN 2018-06-15 2018-06-15 12:10:59 1.0 bd07b66896d6f1494f5b86251848ced7 7c67e1448b00f6e969d365cea6b010ab 2018-06-05 16:19:10 139.94 17.79 moveis_escritorio 55.0 955.0 1.0 8267.0 52.0 52.0 17.0 office_furniture itaquaquecetuba SP
3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c 8775 mogi das cruzes SP -23.499702 -46.185233 951670f92359f4fe4a63112aa7306eba delivered 2018-03-13 16:06:38 2018-03-13 17:29:19 2018-03-27 23:22:42 2018-03-28 16:04:25 2018-04-10 1.0 credit_card 1.0 173.30 059a801bb31f6aab2266e672cab87bc5 5.0 NaN NaN 2018-03-29 2018-04-02 18:36:47 1.0 a5647c44af977b148e0a3a4751a09e2e 7c67e1448b00f6e969d365cea6b010ab 2018-03-27 16:31:16 149.94 23.36 moveis_escritorio 48.0 1066.0 1.0 12160.0 56.0 51.0 28.0 office_furniture itaquaquecetuba SP
4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 13056 campinas SP -22.975100 -47.142925 6b7d50bd145f6fc7f33cebabd7e49d0f delivered 2018-07-29 09:51:30 2018-07-29 10:10:09 2018-07-30 15:16:00 2018-08-09 20:55:48 2018-08-15 1.0 credit_card 8.0 252.25 8490879d58d6c5d7773f2739a03f089a 5.0 a melhor nota O baratheon è esxelente Amo adoro o baratheon 2018-08-10 2018-08-17 01:59:52 1.0 9391a573abe00141c56e38d84d7d5b3b 4a3ca9315b744ce9f8e9374361493884 2018-07-31 10:10:09 230.00 22.25 casa_conforto 61.0 407.0 1.0 5200.0 45.0 15.0 35.0 home_confort ibitinga SP
In [42]:
olist_df.describe()
Out[42]:
customer_zip_code_prefix geolocation_lat geolocation_lng payment_sequential payment_installments payment_value review_score order_item_id price freight_value product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm
count 99441.000000 99163.000000 99163.000000 99440.000000 99440.000000 99440.000000 98673.000000 98666.000000 98666.000000 98666.000000 97248.000000 97248.000000 97248.000000 98650.000000 98650.000000 98650.000000 98650.000000
mean 35137.474583 -21.191224 -46.175442 1.023451 2.914220 158.306291 4.086721 1.141731 125.883144 20.179411 48.841591 793.916780 2.249105 2103.522544 30.092560 16.481318 23.017506
std 29797.938996 5.608637 4.056067 0.235380 2.707625 219.892092 1.347322 0.538452 191.166764 15.855944 9.998595 654.073733 1.744160 3767.704025 16.124922 13.314891 11.738725
min 1003.000000 -33.689948 -72.668881 1.000000 0.000000 0.000000 1.000000 1.000000 0.850000 0.000000 5.000000 4.000000 1.000000 0.000000 7.000000 2.000000 6.000000
25% 11347.000000 -23.589378 -48.097950 1.000000 1.000000 60.090000 4.000000 1.000000 41.492500 13.280000 42.000000 350.000000 1.000000 300.000000 18.000000 8.000000 15.000000
50% 24416.000000 -22.924970 -46.630647 1.000000 2.000000 103.275000 5.000000 1.000000 79.000000 16.350000 52.000000 607.000000 2.000000 700.000000 25.000000 13.000000 20.000000
75% 58900.000000 -20.139828 -43.598897 1.000000 4.000000 175.080000 5.000000 1.000000 139.900000 21.220000 57.000000 995.250000 3.000000 1800.000000 38.000000 20.000000 30.000000
max 99990.000000 42.184003 -8.723762 22.000000 24.000000 13664.080000 5.000000 21.000000 6735.000000 409.680000 76.000000 3992.000000 20.000000 40425.000000 105.000000 105.000000 118.000000
In [43]:
olist_df[olist_df.duplicated()]
Out[43]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp order_item_id product_id seller_id shipping_limit_date price freight_value product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm product_category_name_english seller_city seller_state
Aucun doublon
In [44]:
olist_df[olist_df['review_id'].isnull() & olist_df['review_score'].isnull()]
Out[44]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp order_item_id product_id seller_id shipping_limit_date price freight_value product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm product_category_name_english seller_city seller_state
18 9b8ce803689b3562defaad4613ef426f 7f3a72e8f988c6e735ba118d54f47458 5416 sao paulo SP -23.560949 -46.688422 17825f24877a9289214c301ae0c9424b delivered 2017-05-11 13:48:47 2017-05-13 11:55:16 2017-05-15 15:30:02 2017-06-07 12:02:14 2017-05-30 1.0 credit_card 1.0 67.98 NaN NaN NaN NaN NaT NaT 1.0 99a4788cb24856965c36a24e339b6058 4a3ca9315b744ce9f8e9374361493884 2017-05-19 11:55:16 89.90 12.13 cama_mesa_banho 54.0 245.0 1.0 1383.0 50.0 10.0 40.0 bed_bath_table ibitinga SP
224 265655cf1f2dfd4fc630004d1fe5f3cb fc0a9229ea43fea6c9f9d79c06761a1e 13185 hortolandia SP -22.892792 -47.173849 44918cef2a112fc3f8e4a9b676e5ab23 shipped 2018-02-06 22:08:09 2018-02-06 22:15:42 2018-02-07 22:45:44 NaT 2018-03-12 1.0 credit_card 6.0 125.43 NaN NaN NaN NaN NaT NaT 1.0 d2f1978552a7604790fa5922f4345998 ac3508719a1d8f5b7614b798f70af136 2018-02-12 22:15:42 109.90 15.53 papelaria 58.0 1091.0 4.0 600.0 25.0 30.0 15.0 stationery canoas RS
227 d6b41b191f1f643ff583898156e049cf 3a62a4803c77457a96c996a9db36a662 28375 varre-sai RJ -20.928433 -41.867690 1237d70df58d4f9a8bd929b9943f9b8a delivered 2017-10-09 09:58:11 2017-10-10 04:06:11 2017-11-17 14:19:55 2017-12-05 16:47:02 2017-11-06 1.0 boleto 1.0 75.66 NaN NaN NaN NaN NaT NaT 1.0 3e6f2d0265ff6265eb639a8d3027f5a3 6219ba2359cfb97f2e38fd4c321eb309 2017-10-17 05:06:11 58.00 17.66 utilidades_domesticas 10.0 131.0 1.0 1283.0 33.0 22.0 32.0 housewares sao paulo SP
503 befe184e4eb181470e6063a1a368a093 6ac6902b78ed854e42f75c8e25234e8c 4704 sao paulo SP -23.623508 -46.688288 e91a3bc14b8d125d82f0334371c39926 delivered 2018-08-27 17:01:43 2018-08-27 17:10:27 2018-08-28 14:56:00 2018-08-29 20:04:52 2018-08-31 1.0 credit_card 1.0 53.48 NaN NaN NaN NaN NaT NaT 1.0 2715967684d6991f7501693bfe08c08b 0eaa7e33d73809f1f925f7d07c506cbd 2018-08-29 17:10:27 45.00 8.48 brinquedos 44.0 420.0 1.0 800.0 28.0 35.0 18.0 toys santo andre SP
552 3810ae52bde69bed0acd162b5c18aadb 0beec065ce2d16d3a8dbb22b87cf4c43 22631 rio de janeiro RJ -23.003657 -43.340759 0f503b220e9d1fd78562e41a427a0c6a delivered 2017-09-09 12:12:43 2017-09-09 12:25:21 2017-09-18 19:27:42 2017-10-06 22:21:51 2017-10-02 1.0 credit_card 10.0 480.40 NaN NaN NaN NaN NaT NaT 1.0 0e621085968d9a9585ed3c2c204825e2 94144541854e298c2d976cb893b81343 2017-09-14 12:25:21 459.90 20.50 beleza_saude 52.0 3939.0 2.0 200.0 20.0 15.0 20.0 health_beauty viamao RS
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
98809 5dc6f4b9453584208557163c26079be1 e4326d8c6d585d237b12bced2443c73a 80240 curitiba PR -25.448991 -49.288610 e723fd90e2d095c2762e5e800d759bfa shipped 2017-11-25 17:14:25 2017-11-25 17:32:04 2017-11-27 15:28:26 NaT 2017-12-21 1.0 credit_card 2.0 20.10 NaN NaN NaN NaN NaT NaT 1.0 99b5175198369d12de8feee13dafac16 08d2d642cf72b622b14dde1d2f5eb2f5 2017-12-01 17:31:28 5.00 15.10 beleza_saude 35.0 778.0 1.0 225.0 28.0 15.0 17.0 health_beauty marilia SP
99046 315f814caff07be0d7cf52346a07adf1 a3a53af62bbe94c89dd9f99b63283e6b 30730 belo horizonte MG -19.915305 -43.986883 dacce96767ea2ce331e38a5ebcac5f25 delivered 2018-01-13 18:32:59 2018-01-13 18:49:53 2018-01-19 14:52:00 2018-01-27 12:48:58 2018-02-09 1.0 credit_card 1.0 103.83 NaN NaN NaN NaN NaT NaT 1.0 14a174908a08ad7cfb56d3814d8e0ea5 972d0f9cf61b499a4812cf0bfa3ad3c4 2018-01-18 18:49:53 84.00 19.83 cama_mesa_banho 52.0 1027.0 1.0 1150.0 28.0 26.0 23.0 bed_bath_table brusque SC
99157 917c477d3c63db7af7578b82bb101559 871ca3b66113e2831103e55c820e3472 5069 sao paulo SP -23.514559 -46.706697 cdbb4d02acee4e30dcd2797d3ed1b510 delivered 2017-12-06 11:07:43 2017-12-08 08:31:37 2017-12-08 17:52:27 2017-12-13 14:22:45 2017-12-22 1.0 credit_card 1.0 93.78 NaN NaN NaN NaN NaT NaT 1.0 9df634af5c467f3b07f3d92f7765aa64 8bd0f31cf0a614c658f6763bd02dea69 2017-12-14 08:31:32 82.00 11.78 brinquedos 45.0 965.0 2.0 850.0 16.0 32.0 16.0 toys sao paulo SP
99304 c9eb9435ff687d924d0ea1741bd1a5a6 0d5cc93aefe2e66255f969fdd179c8b5 34000 nova lima MG -20.001543 -43.888103 80ef3cee33c2462dc72f8938e980a03e delivered 2018-06-23 21:46:52 2018-06-23 22:18:32 2018-06-28 11:22:00 2018-07-05 13:15:48 2018-07-20 1.0 credit_card 1.0 36.28 NaN NaN NaN NaN NaT NaT 1.0 9d0bb30eed80184666c8acad23921283 1bb2bdb95f4841f1bba2c0d2cd83d3c9 2018-07-01 22:18:32 19.95 16.33 livros_interesse_geral 26.0 2857.0 1.0 500.0 20.0 22.0 15.0 books_general_interest sao paulo SP
99322 10f6413b5b1071b6b46c72c05df6f4d6 baefaec72794d54a0aeaf7bb6203704b 13451 santa barbara d'oeste SP -22.745050 -47.420435 1f169866113310fa8ab239a0b4c7e8de delivered 2018-04-17 13:25:52 2018-04-18 06:32:05 2018-04-18 22:08:33 2018-04-24 20:47:39 2018-05-10 1.0 boleto 1.0 55.13 NaN NaN NaN NaN NaT NaT 1.0 9d0e471ed428c6b5c87c055d434b0cd6 0ea22c1cfbdc755f86b9b54b39c16043 2018-04-24 06:30:57 39.90 15.23 fashion_bolsas_e_acessorios 51.0 372.0 2.0 150.0 16.0 9.0 11.0 fashion_bags_accessories sete lagoas MG

768 rows × 41 columns

In [45]:
# Suppression des variables inutiles pour notre étude
olist_df = olist_df.drop(['review_comment_title','review_comment_message',
                          'review_creation_date','review_answer_timestamp',
                          'review_id','seller_id'],axis=1)
In [46]:
olist_df[olist_df['geolocation_lng'].isnull() & olist_df['geolocation_lat'].isnull()]
Out[46]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_score order_item_id product_id shipping_limit_date price freight_value product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm product_category_name_english seller_city seller_state
354 ecb1725b26e8b8c458181455dfa434ea b55a113bb84fc10eaf58c6d09ec69794 72300 brasilia DF NaN NaN 8e806cca085990fdc8483aeb130d1330 delivered 2017-11-24 13:33:42 2017-11-24 15:53:48 2017-11-27 21:22:45 2017-12-16 00:52:06 2017-12-20 1.0 credit_card 3.0 71.14 5.0 1.0 22c5ddc22ea8cae84fc9917def682dc6 2017-12-05 15:53:48 56.99 14.15 perfumaria 24.0 435.0 3.0 350.0 25.0 15.0 15.0 perfumery santo andre SP
382 bcf86029aeed4ed8bac0e16eb14c22f5 7cd7974c9f79f75b77f323878ef87f43 11547 cubatao SP NaN NaN 71932b1adae11f63e4a454bb85d4235a delivered 2017-04-21 22:16:56 2017-04-21 22:25:14 2017-04-25 15:01:02 2017-05-16 10:47:59 2017-05-11 1.0 credit_card 1.0 65.89 1.0 1.0 e54464e95c9fb80e4edefa0ba49133c1 2017-04-26 22:25:14 54.90 10.99 brinquedos 57.0 927.0 3.0 800.0 20.0 35.0 20.0 toys barueri SP
877 f4302056f0c58570522590f8181de2c7 67b05b597a66b5c449025000b9430abb 64605 picos PI NaN NaN d0bec96aad189992b278688279ba1511 delivered 2018-05-08 21:04:16 2018-05-08 21:15:28 2018-05-10 18:29:00 2018-05-21 13:06:35 2018-06-05 1.0 credit_card 4.0 99.90 5.0 1.0 53b36df67ebb7c41585e8d54d6772e08 2018-05-14 21:15:28 99.90 0.00 relogios_presentes 33.0 523.0 3.0 584.0 16.0 11.0 13.0 watches_gifts ribeirao preto SP
1218 03bbe0ce5c28e05f22917607db798818 8f3dca4306d5a89e4ae2c65c110603a2 72465 brasilia DF NaN NaN 056349f85a73d794119c4286c95a52de delivered 2017-03-03 14:21:58 2017-03-03 14:35:09 2017-03-03 16:08:50 2017-03-17 12:52:17 2017-03-28 1.0 credit_card 2.0 27.04 5.0 1.0 af35be35db4ad0dc288b571453337376 2017-03-09 14:35:09 10.99 16.05 moveis_decoracao 24.0 457.0 4.0 100.0 80.0 10.0 60.0 furniture_decor ibitinga SP
1272 ad4950aded55c2ea376be59506456d68 aa2b96dd03307ea6dc4b763c0b5f0b39 7729 caieiras SP NaN NaN 17afedb7e5fad837f54bacf5c66c99f8 delivered 2017-10-01 11:21:12 2017-10-01 11:35:13 2017-10-03 20:53:07 2017-10-10 21:52:04 2017-10-24 1.0 credit_card 6.0 70.91 4.0 1.0 016f3b29107ed03252e477b08445cec4 2017-10-05 11:35:13 54.90 16.01 automotivo 56.0 871.0 2.0 3500.0 25.0 5.0 25.0 auto ribeirao preto SP
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
97467 cf818420383856a129134f5f8343f7b8 795c495a65f983b242fb01bd507977c5 72338 brasilia DF NaN NaN e13f9d9f79f8325d4f77d26e7384e9fd delivered 2017-07-11 18:23:43 2017-07-13 03:04:39 2017-07-14 18:34:26 2017-07-25 02:53:28 2017-08-04 1.0 boleto 1.0 59.00 5.0 1.0 880be32f4db1d9f6e2bec38fb6ac23ab 2017-07-19 03:04:39 44.90 14.10 brinquedos 45.0 589.0 4.0 200.0 16.0 7.0 11.0 toys sao paulo SP
97780 67f3e907dce402e696b15f9308ff22ed 6f232f2f5c7f33b7bd9d794d2afacadd 68629 paragominas PA NaN NaN e2f630b53cb2637be372c3810b089868 shipped 2017-07-26 11:44:28 2017-07-27 11:25:25 2017-07-27 17:47:51 NaT 2017-08-29 1.0 boleto 1.0 175.32 2.0 1.0 4a0938478787f49dc07af734b86be96c 2017-08-02 11:25:25 149.00 26.32 relogios_presentes 60.0 508.0 7.0 250.0 16.0 2.0 11.0 watches_gifts sao paulo SP
98140 f792e419335df11d82c32efcfb09c51b c04c085b8e7573ba87b9ae1968d0985e 28530 sao sebastiao do paraiba RJ NaN NaN 84a80b02b3af075990fc7d9d2369f066 delivered 2017-10-15 19:33:54 2017-10-18 08:35:17 2017-10-19 22:27:47 2017-11-03 11:08:03 2017-11-10 1.0 credit_card 5.0 571.16 5.0 1.0 f24fac234abea1ab9560893ecc15a573 2017-10-24 08:35:17 540.00 31.16 automotivo 51.0 1001.0 1.0 3375.0 47.0 22.0 11.0 auto coxim MS
98878 78a11bb1fa72f556996b9a5b9bcd0629 e7536f62a200b415edd9491ac12a17fa 55863 siriji PE NaN NaN 19b6861ce084842fb4ec0cafc59deebe delivered 2018-05-15 13:01:12 2018-05-15 13:30:44 2018-05-15 19:03:00 2018-05-18 16:44:32 2018-06-12 1.0 credit_card 7.0 77.03 5.0 1.0 99ba5fbf3ae895fe3a9289acd0a5eb91 2018-05-18 13:14:43 39.99 37.04 telefonia 55.0 419.0 3.0 150.0 17.0 8.0 14.0 telephony campinas SP
99264 ff09fd7b29e7488a8d8a20badcd8befe 8c21dd8c37144807c601f99f2a209dfb 72587 brasilia DF NaN NaN 157de1652c032429a625d034be79eeeb delivered 2017-05-10 09:00:07 2017-05-10 09:10:18 2017-05-15 10:06:54 2017-05-22 16:47:57 2017-06-01 1.0 credit_card 3.0 39.09 5.0 1.0 cc696680ea0cefcd5d7d5d928afb8de8 2017-05-16 09:10:18 24.99 14.10 fashion_esporte 48.0 1108.0 2.0 200.0 27.0 14.0 13.0 fashion_sport osasco SP

278 rows × 35 columns

In [47]:
msno.matrix(olist_df)
Out[47]:
<AxesSubplot:>
In [48]:
plt.figure(figsize=(7, 5))
sns.countplot(y="review_score", data=olist_df)
plt.title("Répartition des scores des avis clients")
plt.ylabel("Score")
plt.show()
In [49]:
totalOrdersByState = olist_df.groupby('customer_state')['order_id'].nunique().sort_values(ascending=False)
totalOrdersByState

plt.figure(figsize=(15,5))
plt.title('total des commandes par Etats des clients')
sns.barplot(x=totalOrdersByState.index, y=totalOrdersByState)
Out[49]:
<AxesSubplot:title={'center':'total des commandes par Etats des clients'}, xlabel='customer_state', ylabel='order_id'>

Une grande partie des commandes se font depuis l'état de Sao Paolo. Ce qui est tout à fait possible car c'est l'état le plus peuplé du Brésil.

In [50]:
olist_df[olist_df.customer_state.str.contains('SP')]['customer_state'].value_counts()
Out[50]:
SP    41746
Name: customer_state, dtype: int64
In [51]:
plt.figure(figsize=(7, 5))
sns.countplot(y="order_status", data=olist_df)
plt.title("Répartition des statuts de commandes")
plt.ylabel("order_status")
plt.show()

Valeurs aberrantes

In [52]:
olist_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 99441 entries, 0 to 99440
Data columns (total 35 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   customer_id                    99441 non-null  object        
 1   customer_unique_id             99441 non-null  object        
 2   customer_zip_code_prefix       99441 non-null  int64         
 3   customer_city                  99441 non-null  object        
 4   customer_state                 99441 non-null  object        
 5   geolocation_lat                99163 non-null  float64       
 6   geolocation_lng                99163 non-null  float64       
 7   order_id                       99441 non-null  object        
 8   order_status                   99441 non-null  object        
 9   order_purchase_timestamp       99441 non-null  datetime64[ns]
 10  order_approved_at              99281 non-null  datetime64[ns]
 11  order_delivered_carrier_date   97658 non-null  datetime64[ns]
 12  order_delivered_customer_date  96476 non-null  datetime64[ns]
 13  order_estimated_delivery_date  99441 non-null  datetime64[ns]
 14  payment_sequential             99440 non-null  float64       
 15  payment_type                   99440 non-null  object        
 16  payment_installments           99440 non-null  float64       
 17  payment_value                  99440 non-null  float64       
 18  review_score                   98673 non-null  float64       
 19  order_item_id                  98666 non-null  float64       
 20  product_id                     98666 non-null  object        
 21  shipping_limit_date            98666 non-null  datetime64[ns]
 22  price                          98666 non-null  float64       
 23  freight_value                  98666 non-null  float64       
 24  product_category_name          97248 non-null  object        
 25  product_name_lenght            97248 non-null  float64       
 26  product_description_lenght     97248 non-null  float64       
 27  product_photos_qty             97248 non-null  float64       
 28  product_weight_g               98650 non-null  float64       
 29  product_length_cm              98650 non-null  float64       
 30  product_height_cm              98650 non-null  float64       
 31  product_width_cm               98650 non-null  float64       
 32  product_category_name_english  97226 non-null  object        
 33  seller_city                    98666 non-null  object        
 34  seller_state                   98666 non-null  object        
dtypes: datetime64[ns](6), float64(16), int64(1), object(12)
memory usage: 29.3+ MB
In [53]:
# Date de commande > date à de paiement
olist_df[(olist_df['order_purchase_timestamp'] > olist_df['order_delivered_carrier_date'])]
Out[53]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_score order_item_id product_id shipping_limit_date price freight_value product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm product_category_name_english seller_city seller_state
937 a125a54036b56ceb4c73638eda0f18aa 19067e6acbb88e562f0f58eca762ac76 26015 nova iguacu RJ -22.744987 -43.439115 69aca49c98f05af05bec6fa1f87a3059 delivered 2018-07-20 13:19:19 2018-07-20 13:30:13 2018-07-20 12:01:00 2018-07-25 17:03:35 2018-08-14 1.0 credit_card 2.0 147.59 5.0 1.0 c848bbf7ec73272cce4df7509a71afde 2018-07-24 13:30:13 119.80 27.79 bebidas 27.0 467.0 1.0 1925.0 30.0 29.0 20.0 drinks pitanga PR
1366 b58a35cb4e1a1f9ea74dcf0b2639a9ac fd837fa3548f0e74f75081f38a34e56b 93510 novo hamburgo RS -29.682441 -51.123471 db090a16182b263b1e896bb26c6f66cf delivered 2018-07-13 16:14:08 2018-07-13 16:44:09 2018-07-13 13:59:00 2018-07-27 11:34:31 2018-08-06 1.0 credit_card 2.0 46.82 5.0 1.0 4c7d4a2efde21e3bf1660926fabc6a9d 2018-07-18 16:31:30 31.50 15.32 beleza_saude 60.0 316.0 3.0 250.0 18.0 10.0 15.0 health_beauty sao paulo SP
1488 3088cc4bfafe311f1d84830808c2aa6d 2f0def88a867b9c0aabb60c561b4d640 9991 diadema SP -23.695094 -46.612240 f7780ea2807db31691e83f0013294035 delivered 2018-07-30 15:22:15 2018-07-30 15:35:16 2018-07-30 15:00:00 2018-08-02 18:32:30 2018-08-02 1.0 credit_card 1.0 67.67 5.0 1.0 904a9e449fbfd90da738ab70ae6ca003 2018-08-01 15:35:16 59.99 7.68 eletronicos 43.0 742.0 2.0 283.0 18.0 9.0 15.0 electronics sao paulo SP
2375 e489861e700dfc79e5347bbf75b472e0 44191aeadca96c5a073edb7f09ceee05 13495 iracemapolis SP -22.586590 -47.514345 3b40ede283844ba0fbdd110f8a56792f delivered 2018-07-18 14:35:41 2018-07-18 14:45:20 2018-07-18 13:06:00 2018-07-25 17:27:25 2018-08-01 1.0 credit_card 1.0 70.07 5.0 1.0 703d742887bb9267f89b675608ba7aa0 2018-07-20 14:45:20 56.10 13.97 cama_mesa_banho 35.0 509.0 2.0 950.0 35.0 6.0 45.0 bed_bath_table ibitinga SP
3196 52f41b570c31d5f465eea3427b30334e fc0334b0e1e56824d1886ffc4b7d2f36 17602 tupa SP -21.930548 -50.498348 c6f11a51ee3384a11e50cc55dc685c24 delivered 2018-08-15 14:02:44 2018-08-15 14:24:18 2018-08-15 13:56:00 2018-08-21 10:15:44 2018-08-28 1.0 credit_card 6.0 133.84 4.0 1.0 fcaab5d7f656094e49fbe4ee3a506658 2018-08-20 14:24:18 109.00 24.84 papelaria 60.0 234.0 4.0 7650.0 28.0 55.0 37.0 stationery campo limpo paulista SP
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
92713 eb7da1cd8b8d48a43fdb3d86f34beff4 41bcd3882167cab0e3e3dafb0a292473 91430 porto alegre RS -30.050784 -51.142749 692f4207d359fbf24d641b1470721130 delivered 2018-06-04 12:10:06 2018-06-04 12:31:17 2018-06-04 11:04:00 2018-06-21 21:56:34 2018-07-17 1.0 credit_card 3.0 92.10 4.0 1.0 f0d56a1df2c0a7a756d7e4352cdcc8f0 2018-06-12 12:30:58 74.00 18.10 ferramentas_jardim 50.0 147.0 3.0 2300.0 67.0 2.0 41.0 garden_tools santo andre SP
94524 8e6ca829706ccbf262d3cfcebe7ceb52 344dcfca74b8df7ab46e9f1846570515 64017 teresina PI -5.108520 -42.793093 ab815320404ada153a9d06f166c961dc delivered 2018-05-17 09:48:09 2018-05-17 10:38:29 2018-05-17 09:45:00 2018-05-30 21:12:35 2018-06-13 1.0 credit_card 1.0 66.85 4.0 1.0 17552c3e04bda2b08ac85c9c55137dc3 2018-05-29 10:31:23 44.00 22.85 eletronicos 51.0 1290.0 3.0 200.0 16.0 4.0 11.0 electronics sao paulo SP
96903 1e2805c0173170a09ce422b2dab60c30 8929b9f8e7f1a900939386dcf9933241 95660 tres coroas RS -29.511356 -50.781151 da383ff6aa9558f3bd909b1d76789381 delivered 2018-08-20 13:56:52 2018-08-20 16:11:39 2018-08-20 13:55:00 2018-08-27 23:32:23 2018-08-31 1.0 credit_card 3.0 69.44 5.0 1.0 a412e6de2f924b291fcd8911e0f25eae 2018-08-22 16:11:39 49.90 19.54 brinquedos 20.0 345.0 1.0 760.0 30.0 28.0 21.0 toys londrina PR
97674 9bbcc57d44cde58fff7d4871e3c513d1 63a550721c8e5d6d69bee726f43ade8a 88332 balneario camboriu SC -27.014997 -48.605393 5a339130640419f9e00ab4b6ff61dc22 delivered 2018-04-27 15:59:23 2018-04-27 16:15:25 2018-04-27 14:35:00 2018-05-02 15:32:39 2018-05-16 1.0 credit_card 5.0 163.65 5.0 1.0 25f86162fee18735fffdb762dcb10d7c 2018-05-07 16:15:25 149.90 13.75 cama_mesa_banho 58.0 2400.0 2.0 10800.0 54.0 27.0 36.0 bed_bath_table joinville SC
98224 bf75820404561a3f35085c58a99a63fd 42d00e20578ce3c693acefa74c150cc8 31080 belo horizonte MG -19.884867 -43.909297 ffcc17a2f3331d4d25cf692060921690 delivered 2018-06-29 14:41:28 2018-06-29 14:50:16 2018-06-29 14:40:00 2018-07-03 16:14:35 2018-07-13 1.0 credit_card 1.0 103.36 5.0 1.0 42ec84ace63b58b8c5a7ba7be01d5fb8 2018-07-03 14:50:16 89.90 13.46 cool_stuff 52.0 1732.0 10.0 2550.0 24.0 23.0 27.0 cool_stuff belo horizonte MG

166 rows × 35 columns

In [54]:
olist_df = olist_df[~(olist_df['order_purchase_timestamp'] > olist_df['order_delivered_carrier_date'])]
olist_df.shape
Out[54]:
(99275, 35)
In [55]:
# Date de commande > date à laquelle la commande a été traitée
olist_df[(olist_df['order_purchase_timestamp'] > olist_df['order_approved_at'])]
Out[55]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_score order_item_id product_id shipping_limit_date price freight_value product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm product_category_name_english seller_city seller_state
In [56]:
# Date de commande > date de livraison
olist_df[(olist_df['order_purchase_timestamp'] > olist_df['order_delivered_customer_date'])]
Out[56]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_score order_item_id product_id shipping_limit_date price freight_value product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm product_category_name_english seller_city seller_state

Outliers

In [57]:
integer_col  = olist_df.select_dtypes(include=[np.float, np.integer]).columns
In [58]:
integer_col[1::]
Out[58]:
Index(['geolocation_lat', 'geolocation_lng', 'payment_sequential',
       'payment_installments', 'payment_value', 'review_score',
       'order_item_id', 'price', 'freight_value', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm'],
      dtype='object')
In [59]:
# Figure
plt.figure(figsize = (11,7))
x = olist_df[integer_col[1::]]
sns.set(style='whitegrid')
sns.boxplot(data=x)

# Add labels
plt.title("Distribution de quelques valeurs numériques clients")
plt.xlabel("")
plt.ylabel('Données clients')
plt.xticks(rotation=45)
plt.show()
In [60]:
plt.figure(figsize=(7, 5))
sns.countplot(y="product_photos_qty", data=olist_df)
plt.title("Nombre de photos par produits")
plt.ylabel("product_photos_qty")
plt.show()
In [61]:
payment = olist_df.groupby(['payment_type']).count()[
    'customer_city'].sort_values(ascending=False)
plt.figure(figsize=(10,4))
plt.title('Répartition des moyens de paiement')
sns.barplot(x = payment.values,
           y = payment.index)
Out[61]:
<AxesSubplot:title={'center':'Répartition des moyens de paiement'}, ylabel='payment_type'>
In [62]:
plt.figure(figsize=(10,10))
sns.heatmap(olist_df.corr())
Out[62]:
<AxesSubplot:>

Variable quantitatives

In [63]:
integer_col = olist_df.select_dtypes(include=[np.float, np.integer]).columns
In [64]:
# Distribution des variables quantitatives
for n in range(olist_df[integer_col].shape[1]):
    plt.figure(figsize=(16, 8))
    plot = sns.histplot(x=olist_df[integer_col].iloc[:, n], bins=100)
    plt.title(f"Distribution de {olist_df[integer_col].iloc[:, n].name}",
              fontsize=16)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.savefig(f"{olist_df[integer_col].iloc[:, n].name}_dist.png",
                bbox_inches='tight')
    plt.show()
In [65]:
olist_df.columns
Out[65]:
Index(['customer_id', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state', 'geolocation_lat', 'geolocation_lng',
       'order_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'payment_sequential', 'payment_type', 'payment_installments',
       'payment_value', 'review_score', 'order_item_id', 'product_id',
       'shipping_limit_date', 'price', 'freight_value',
       'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'product_category_name_english', 'seller_city', 'seller_state'],
      dtype='object')
In [66]:
olist_df.shape
Out[66]:
(99275, 35)
In [67]:
prod = olist_df[olist_df['product_name_lenght']>65]
In [68]:
prod.shape
Out[68]:
(15, 35)
In [69]:
fig = px.pie(prod, values='product_name_lenght', names='product_category_name',title='Top 5 categories par leur taille de nom')
fig.show()
fig.write_image("plots/product_length.png")
In [70]:
fig = px.pie(prod, values='product_name_lenght', names='product_category_name_english',title='Top 5 categories par leur taille de nom')
fig.show()
fig.write_image("plots/product_En_length.png")
In [71]:
px.pie(prod, values='product_name_lenght', names='payment_value',title='Répatition des produits par leur taille et prix')
In [72]:
plt.figure(figsize=(20, 10))
sns.heatmap(abs(olist_df[integer_col].corr()), annot=True, cmap="YlGnBu",
            annot_kws={"fontsize":12})
plot.set_xlabel(' ', fontsize=12)
plot.set_ylabel(f" ", fontsize=12)
plot.tick_params(labelsize=12)
plt.savefig(f"plots/corr_table_plot.png", bbox_inches="tight")

Variable Catégorielles

In [73]:
cat_col  = olist_df.select_dtypes(include=[np.object]).columns
In [74]:
cat_col = ['customer_state','order_status', 'payment_type', 'product_category_name','product_category_name_english', 'seller_state']
In [75]:
for n in range(olist_df[cat_col].shape[1]):
    plt.figure(figsize=(10, 20))
    plot = sns.countplot(y=olist_df[cat_col].iloc[:, n], palette='Blues_r')
    plt.title(f"Distribution de {olist_df[cat_col].iloc[:,n].name}",
              fontsize=12)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()
In [76]:
colASupprimer = ['product_category_name', 'product_name_lenght',
                 'product_description_lenght', 'product_photos_qty', 'product_weight_g',
                 'product_length_cm', 'product_height_cm', 'product_width_cm']
In [77]:
olist_df.drop(colASupprimer, axis=1, inplace=True)
olist_df = olist_df.rename(columns={'product_category_name_english': 'product_category_name'})
In [78]:
plt.figure(figsize=(16, 10))
plt.title('Nombre de produit vendu/acheté par catégorie', fontsize = 20)
sns.barplot(x = olist_df['product_category_name'].value_counts().index,
            y = olist_df['product_category_name'].value_counts().values)
plt.xlabel("Nom des différentes catégories", fontsize = 12)
plt.ylabel("Nombre de produit", fontsize = 12)
plt.tick_params(axis = 'both', labelsize = 12)
plt.xticks(rotation=90)
plt.show()
In [79]:
print("Number of unique categories: ", len(olist_df.product_category_name.unique()))
Number of unique categories:  72
In [80]:
plt.figure(figsize=(10,6))
top_25_prod_categories = olist_df.groupby('product_category_name')['product_id'].count().sort_values(ascending=False).head(25)
sns.barplot(x=top_25_prod_categories.index, y=top_25_prod_categories.values)
plt.xticks(rotation=90)
plt.xlabel('Product Category')
plt.title('Top 25 des catégories');
plt.show()

Variables temporelles

In [81]:
temp_col  = olist_df.select_dtypes(exclude=[np.object, np.integer, np.float]).columns
In [82]:
for n in range(olist_df[temp_col].shape[1]):
    plt.figure(figsize=(16, 10))
    plot = sns.histplot(x=olist_df[temp_col].iloc[:, n], bins=100)
    plt.title(f"Distribution de {olist_df[temp_col].iloc[:, n].name}",
              fontsize=20)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.savefig(f"{olist_df[temp_col].iloc[:, n].name}_dist.png",
                bbox_inches='tight')
    plt.show()
In [83]:
olist_df.columns
Out[83]:
Index(['customer_id', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state', 'geolocation_lat', 'geolocation_lng',
       'order_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'payment_sequential', 'payment_type', 'payment_installments',
       'payment_value', 'review_score', 'order_item_id', 'product_id',
       'shipping_limit_date', 'price', 'freight_value',
       'product_category_name', 'seller_city', 'seller_state'],
      dtype='object')
In [84]:
start_date = '31-12-2018'
In [85]:
mask = (olist_df['shipping_limit_date'] > start_date)
In [86]:
df = olist_df.loc[mask]
df
Out[86]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_score order_item_id product_id shipping_limit_date price freight_value product_category_name seller_city seller_state
9742 22e922696a7d1ab9a19c6b702fedc387 b030929cf3b8c3370ea8c611f9ccb32e 71905 brasilia DF NaN NaN 9c94a4ea2f7876660fa6f1b59b69c8e6 shipped 2017-03-14 19:23:22 2017-03-14 19:23:22 2017-03-16 14:31:15 NaT 2017-08-04 1.0 boleto 1.0 90.69 1.0 1.0 282b126b2354516c5f400154398f616d 2020-02-03 20:23:22 75.99 14.70 housewares jacutinga MG
25211 b279a1d441c73c1974d7a63618855aa0 3977f83a14549e6265bcded84e92ee80 21210 rio de janeiro RJ -22.843731 -43.302427 13bdf405f961a6deec817d817f5c6624 canceled 2017-03-16 02:30:51 2017-03-16 02:30:51 NaT NaT 2017-08-08 1.0 boleto 1.0 84.65 1.0 1.0 96ea060e41bdecc64e2de00b97068975 2020-02-05 03:30:51 69.99 14.66 housewares jacutinga MG
41582 6357fffb5704244d552615bbfcea1442 44b6bbfea26596437062a38c8e6bcec1 15040 sao jose do rio preto SP -20.799615 -49.415358 c2bb89b5c1dd978d507284be78a04cb2 delivered 2017-05-23 22:28:36 2017-05-24 22:35:08 2017-05-29 02:03:28 2017-06-09 13:35:54 2017-10-11 1.0 credit_card 15.0 322.86 5.0 2.0 87b92e06b320e803d334ac23966c80b1 2020-04-09 22:35:08 99.99 61.44 housewares jacutinga MG
In [87]:
df = pd.DataFrame()
sum_missing_values = olist_df.isnull().sum()
percentage = 100 * sum_missing_values / len(olist_df)
df['% Missing Values'] = percentage.round(2)
df.style.background_gradient(axis=0)
Out[87]:
% Missing Values
customer_id 0.000000
customer_unique_id 0.000000
customer_zip_code_prefix 0.000000
customer_city 0.000000
customer_state 0.000000
geolocation_lat 0.280000
geolocation_lng 0.280000
order_id 0.000000
order_status 0.000000
order_purchase_timestamp 0.000000
order_approved_at 0.160000
order_delivered_carrier_date 1.800000
order_delivered_customer_date 2.990000
order_estimated_delivery_date 0.000000
payment_sequential 0.000000
payment_type 0.000000
payment_installments 0.000000
payment_value 0.000000
review_score 0.770000
order_item_id 0.780000
product_id 0.780000
shipping_limit_date 0.780000
price 0.780000
freight_value 0.780000
product_category_name 2.230000
seller_city 0.780000
seller_state 0.780000

Le taux des valeurs manquantes est faible , nous pouvons supprimer les lignes contenant des valeurs manquante.

In [88]:
olist_df = olist_df.dropna()
olist_df.shape
Out[88]:
(94023, 27)
In [89]:
olist_df["review_score"] = olist_df["review_score"].astype("int32")
In [90]:
olist_df[olist_df['order_status'] == 'delivered'].shape[0]/ olist_df.shape[0] * 100
Out[90]:
99.99361858268722

On remarque qu'il y a une petite pourcentage de commande qui n'ont pas la statut délivré. On va donc se centré que sur les produits délivré.

In [91]:
olist_df = olist_df[olist_df['order_status'] == 'delivered']
In [92]:
olist_df.describe().transpose()
Out[92]:
count mean std min 25% 50% 75% max
customer_zip_code_prefix 94017.0 35109.686004 29829.252868 1004.000000 11320.000000 24355.000000 58570.000000 99980.000000
geolocation_lat 94017.0 -21.204017 5.597764 -33.689948 -23.590310 -22.926257 -20.140360 42.184003
geolocation_lng 94017.0 -46.191210 4.055096 -72.668881 -48.118347 -46.632826 -43.630451 -8.723762
payment_sequential 94017.0 1.023155 0.225119 1.000000 1.000000 1.000000 1.000000 17.000000
payment_installments 94017.0 2.913707 2.707436 0.000000 1.000000 2.000000 4.000000 24.000000
payment_value 94017.0 157.188860 215.766201 0.000000 60.000000 102.990000 174.810000 13664.080000
review_score 94017.0 4.157333 1.283149 1.000000 4.000000 5.000000 5.000000 5.000000
order_item_id 94017.0 1.141666 0.535263 1.000000 1.000000 1.000000 1.000000 21.000000
price 94017.0 125.117245 188.315476 0.850000 41.500000 79.000000 139.900000 6735.000000
freight_value 94017.0 20.155780 15.771605 0.000000 13.260000 16.380000 21.230000 409.680000
In [93]:
# Plot categories count in dataset
fig = plt.figure(figsize=(18, 8))
sns.countplot(data=olist_df, x="product_category_name")
plt.xticks(range(0, olist_df["product_category_name"].nunique()),
           olist_df["product_category_name"].unique(),
           rotation=90)
plt.title(f"Les catégories produits les plus représentées\n")
plt.show()

On va regrouper plusieurs catégories en une catégorie

  • Home Decor
  • Food and drinks
  • Beauty
  • Media
  • Fashion
  • Sport
  • Other
  • Technology
  • Travel
  • Baby
  • Art
In [94]:
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('fashio|watch')] = 'Fashion'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('health|beauty|perfum')] = 'Beauty'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('home|furniture|garden|bath|house|cuisine|flower')] = 'Home Decor'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('book')] = 'Books'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('phon|compu|tablet|electro|consol|security|Tech|appliance|air')] = 'Electronics and Tech'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('baby|diaper')] = 'Baby'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('grocer|food|drink')] = 'Food and drinks'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('media|cd|audio|musi|dvd|cine')] = 'Media'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('art')] = 'Art'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('construction|costruction')] = 'Construction'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('commerce|market_place')] = 'Industry and Commerce'
olist_df['product_category_name'].loc[olist_df['product_category_name'].str.contains('auto|stati|chris|cool|Art')] = 'Miscellaneous'
/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py:1637: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [95]:
# Plot categories count in dataset
fig = plt.figure(figsize=(18, 8))
sns.countplot(data=olist_df, x="product_category_name")
plt.xticks(range(0, olist_df["product_category_name"].nunique()),
           olist_df["product_category_name"].unique(),
           rotation=90)
plt.title(f"Les catégories produits")
plt.show()

Analyse sur un customer

In [96]:
cust_many_orders = olist_df[olist_df['customer_unique_id'] == '8d50f5eadf50201ccdcedfb9e2ac8455']
In [97]:
cust_many_orders
Out[97]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state geolocation_lat geolocation_lng order_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date payment_sequential payment_type payment_installments payment_value review_score order_item_id product_id shipping_limit_date price freight_value product_category_name seller_city seller_state
14186 1bd3585471932167ab72a84955ebefea 8d50f5eadf50201ccdcedfb9e2ac8455 4045 sao paulo SP -23.618666 -46.639583 b850a16d8faf65a74c51287ef34379ce delivered 2017-11-22 20:01:53 2017-11-22 20:12:32 2017-11-24 16:07:56 2017-11-27 18:49:13 2017-12-04 1.0 credit_card 1.0 21.77 5 1.0 5bccc8fe44471df6c7b7e08176837d2f 2017-11-28 20:12:32 13.99 7.78 sports_leisure sao paulo SP
15321 a8fabc805e9a10a3c93ae5bff642b86b 8d50f5eadf50201ccdcedfb9e2ac8455 4045 sao paulo SP -23.618666 -46.639583 23427a6bd9f8fd1b51f1b1e5cc186ab8 delivered 2018-05-21 22:44:31 2018-05-22 01:53:35 2018-05-22 14:18:00 2018-05-23 15:33:09 2018-05-29 1.0 credit_card 1.0 53.38 5 1.0 5cb96c51c55f57503465e4d2558dc053 2018-05-24 01:53:35 45.99 7.39 sports_leisure sao paulo SP
16654 897b7f72042714efaa64ac306ba0cafc 8d50f5eadf50201ccdcedfb9e2ac8455 4045 sao paulo SP -23.618666 -46.639583 c2213109a2cc0e75d55585b7aaac6d97 delivered 2018-08-07 23:32:14 2018-08-07 23:45:21 2018-08-09 13:35:00 2018-08-10 20:26:44 2018-08-13 1.0 credit_card 1.0 34.44 5 1.0 de7df82eb1e48c456b020c2cfb03aeed 2018-08-12 23:45:21 26.99 7.45 sports_leisure sao paulo SP
36122 b2b13de0770e06de50080fea77c459e6 8d50f5eadf50201ccdcedfb9e2ac8455 4045 sao paulo SP -23.618666 -46.639583 369634708db140c5d2c4e365882c443a delivered 2017-06-18 22:56:48 2017-06-18 23:10:19 2017-06-19 20:12:26 2017-06-23 12:55:50 2017-07-07 1.0 credit_card 2.0 51.75 5 1.0 d83509907a19c72e1e4cdde78b8177ec 2017-06-22 23:10:19 39.90 11.85 sports_leisure votuporanga SP
38073 42dbc1ad9d560637c9c4c1533746f86d 8d50f5eadf50201ccdcedfb9e2ac8455 4045 sao paulo SP -23.618666 -46.639583 bf92c69b7cc70f7fc2c37de43e366173 delivered 2017-07-24 22:11:50 2017-07-24 22:25:14 2017-07-26 01:42:03 2017-07-31 16:59:58 2017-08-15 1.0 credit_card 3.0 166.71 5 1.0 e01bc2ad5c6f46c0f53f73379fcd602e 2017-07-30 22:25:14 149.90 16.81 sports_leisure joinville SC
40141 dfb941d6f7b02f57a44c3b7c3fefb44b 8d50f5eadf50201ccdcedfb9e2ac8455 4045 sao paulo SP -23.618666 -46.639583 4f62d593acae92cea3c5662c76122478 delivered 2017-07-18 23:10:58 2017-07-18 23:23:26 2017-07-20 19:00:02 2017-07-21 16:19:40 2017-07-31 1.0 credit_card 1.0 21.77 5 1.0 94cc774056d3f2b0dc693486a589025e 2017-07-24 23:23:26 13.99 7.78 Fashion sao paulo SP
48614 65f9db9dd07a4e79b625effa4c868fcb 8d50f5eadf50201ccdcedfb9e2ac8455 4045 sao paulo SP -23.618666 -46.639583 112eb6f37f1b9dabbced368fbbc6c9ef delivered 2018-07-23 21:53:02 2018-07-24 10:31:34 2018-07-25 10:25:00 2018-07-26 18:29:28 2018-08-02 1.0 credit_card 1.0 107.85 5 1.0 41f6cb7c3b1200749326e50106f32d58 2018-07-30 09:43:43 99.00 8.85 sports_leisure sao paulo SP
58707 a682769c4bc10fc6ef2101337a6c83c9 8d50f5eadf50201ccdcedfb9e2ac8455 4045 sao paulo SP -23.618666 -46.639583 d3582fd5ccccd9cb229a63dfb417c86f delivered 2018-08-20 19:14:26 2018-08-20 19:30:05 2018-08-21 15:11:00 2018-08-24 14:08:43 2018-09-04 1.0 credit_card 1.0 38.66 5 1.0 df473738565b52f77b4e22b328b41576 2018-08-22 19:30:05 23.40 15.26 Construction curitiba PR
67996 6289b75219d757a56c0cce8d9e427900 8d50f5eadf50201ccdcedfb9e2ac8455 4045 sao paulo SP -23.618666 -46.639583 6bdf325f0966e3056651285c0aed5aad delivered 2018-05-22 23:08:55 2018-05-22 23:36:01 2018-05-23 19:02:00 2018-05-24 11:58:23 2018-05-30 1.0 credit_card 1.0 62.95 5 1.0 d6354128c28cc56532ba7393d9373083 2018-05-24 23:31:13 51.80 11.15 sports_leisure campinas SP
72745 3414a9c813e3ca02504b8be8b2deb27f 8d50f5eadf50201ccdcedfb9e2ac8455 4045 sao paulo SP -23.618666 -46.639583 a1a6eadae352582010f449d8527b55f3 delivered 2018-08-18 12:39:19 2018-08-18 12:50:37 2018-08-20 14:43:00 2018-08-24 17:26:42 2018-09-03 1.0 credit_card 2.0 96.40 5 1.0 426f910ccd39ae5e7d55013c466fe343 2018-08-23 12:50:37 72.90 23.50 sports_leisure londrina PR
83363 f5188d99e9281e214a4a7d1b139a8229 8d50f5eadf50201ccdcedfb9e2ac8455 4045 sao paulo SP -23.618666 -46.639583 838f8e96cf584a1d8d22eaf7143c7772 delivered 2018-07-04 22:35:13 2018-07-05 16:27:55 2018-07-06 10:15:00 2018-07-10 21:27:30 2018-07-17 1.0 credit_card 1.0 30.41 5 1.0 fe7e33eaa09b5a4a3d3aa477280db875 2018-07-08 23:31:20 22.99 7.42 Fashion sao paulo SP
85507 89be66634d68fa73a95499b6352e085d 8d50f5eadf50201ccdcedfb9e2ac8455 4045 sao paulo SP -23.618666 -46.639583 cd4b336a02aacabd0ef22f6db711f95e delivered 2017-10-18 23:25:04 2017-10-19 00:36:08 2017-10-20 17:11:50 2017-10-23 18:33:01 2017-10-30 1.0 credit_card 1.0 37.77 5 1.0 9169f70b7ce1138ac35678432d1e5ff2 2017-10-25 00:36:08 29.99 7.78 sports_leisure sao paulo SP
90268 0bf8bf19944a7f8b40ba86fef778ca7c 8d50f5eadf50201ccdcedfb9e2ac8455 4045 sao paulo SP -23.618666 -46.639583 e3071b7624445af6e4f3a1b23718667d delivered 2017-09-05 22:14:52 2017-09-05 22:30:56 2017-09-06 15:26:12 2017-09-11 13:27:49 2017-09-22 1.0 credit_card 1.0 51.75 5 1.0 88159bac10e3d4f4b69b157406c7f70f 2017-09-12 22:30:56 39.90 11.85 sports_leisure sao jose do rio preto SP
93591 9a1afef458843a022e431f4cb304dfe9 8d50f5eadf50201ccdcedfb9e2ac8455 4045 sao paulo SP -23.618666 -46.639583 89d9b111d2b990deb5f5f9769f92800b delivered 2017-10-29 16:58:02 2017-10-29 17:10:09 2017-10-30 15:58:52 2017-10-31 15:33:47 2017-11-10 1.0 credit_card 1.0 21.77 5 1.0 94cc774056d3f2b0dc693486a589025e 2017-11-03 17:10:09 13.99 7.78 Fashion sao paulo SP
In [98]:
fig = px.scatter(cust_many_orders, x='product_category_name', y='price'
                 ,title='Customer with most purchase as per product category price')
fig.show()
fig.write_image("plots/one_cust_purchase.png")
In [99]:
fig = px.scatter(cust_many_orders, x='order_delivered_customer_date', y='order_purchase_timestamp'
           ,title='Customer with most purchase delivery dates')
fig.show()
fig.write_image("plots/one_cust_purchase_time.png")
In [100]:
olist_df.to_csv('Dataset/olist_final_cleaned.csv')
In [ ]: